package com.topsmob.processer;

import com.topsmob.StartApp;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.example.AppStore;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.XpathSelector;

import java.util.ArrayList;
import java.util.List;

/**
 * com.topsmob.tool
 * Description:我的异常网爬虫
 * User:李耀华
 * Date:2017-06-22.16:20
 */
public class MyExceptionProcesser implements PageProcessor {
    public String model = "ajax";
    String channelUrl = "http://www.myexception.cn/"+model+"/";
    public void process(Page page) {
        if(page.getUrl().toString().contains("http://www.myexception.cn/"+model+"/index")){
            if(page.getUrl().toString().replace("http://www.myexception.cn/"+model+"/index","")
                    .replace(".html","")
                    .replace("_","").equals("")){

                //发现所有的列表页
                 String end = page.getHtml().css("div.c_p_s>ul>font>li:last-child>a:last-child","href").toString();
                if(StringUtils.isNotEmpty(end)){
                    int end_num = Integer.parseInt(end.replace("index_","").replace(".html",""));
                    List<String> pageList = new ArrayList<String>();
                    for(int i=2;i<=end_num;i++){
                        pageList.add(channelUrl+"index_"+i+".html");
                       // System.out.println("发现Url"+channelUrl+"index_"+i+".html");
                    }
                    page.addTargetRequests(pageList);
                }
            }
            //发现文章
            List<String> links = page.getHtml().css("div.c_c>ul>li>a","href").all();

            page.addTargetRequests(links);
            page.setSkip(true);

        }else{//文章页

            String title = page.getHtml().xpath("//div[@class=\"c_t\"]/h1/text()").toString();
            //System.out.println(title);
            page.putField("title",title);
            String content = page.getHtml().xpath("//div[@class=\"c_txt\"]").toString();

            page.putField("content",content);
            page.putField("postId",page.getUrl().replace("http://www.myexception.cn/"+model+"/","")
            .replace(".html",""));
        }

    }

    public Site getSite() {
        return  Site.me().setRetryTimes(3).setTimeOut(30*1000).setDomain(StartApp.siteUrl).setCharset("utf-8");
    }
}
